*******************************************************************************
**
** This script cleans the raw HCRIS files and constructs a hospital-year panel 
** Requirements: 
** 	- Downloaded HCRIS files in input/  
** 	- input/lookup_20210809.xlsx: An excel spreadsheet with desired variables to read 
** 
*******************************************************************************



** specify the input directory
** this should contain the raw HCRIS files and lookup.xlsx
local fpath_input = "/homes/nber/shruthi-dua51934/sacarny-DUA51934/shruthi-dua51934/replication_files/build/hcris/input"

** specify the output directory 
local fpath_output = "/homes/nber/shruthi-dua51934/sacarny-DUA51934/shruthi-dua51934/replication_files/build/hcris/output"


set more off
capture log close
clear
log using "`fpath_output'/hcris.log", replace

* use cost reports from these years' data files
global STARTYEAR = 2000
global ENDYEAR = 2017

* with files from those years, we often (~50% of hospitals) don't observe
* reports covering the full calendar STARTYEAR. and often don't observe (~20%
* of hospitals) reports covering the full calendar ENDYEAR.
* so we shrink the extents of the hospital-year file.
global STARTYEAR_HY = $STARTYEAR+1
global ENDYEAR_HY = $ENDYEAR-1

* make the output folder if it doesn't exist
capture mkdir output

* import the lookup table
* edit below to change location of the lookup table 
import excel using "`fpath_input'/lookup_20210809.xlsx", firstrow clear
keep if enabled == 1 
cap drop G 
cap drop H 
save "`fpath_input'/lookup_20210809.dta", replace 
tempfile lookup
save `lookup'


* process each year's cost report
forvalues year=$STARTYEAR/$ENDYEAR {

	if (`year'>= 2012) {
		local formats "10"
	}
	else if (`year'==2010 | `year'==2011) {
		local formats "96 10"
	}
	else {
		local formats "96"
	}
	
	foreach fmt in `formats' {

		// process numeric file to pull in components of margins

		display "processing hosp_nmrc2552_`fmt'_`year'_long.dta"
	
		use rpt_rec_num itm_val_num wksht_cd clmn_num line_num ///
			if inlist(wksht_cd,"G300000","G200000","S100000","S300001", "C000001", "E00A18A", "A000000", "A700002", "A700001") ///
			using "`fpath_input'/hosp_nmrc2552_`fmt'_`year'_long.dta", clear 
	
		// identify the variables we need and label the rows		
		// merge with the lookup table
		gen fmt = `fmt'
		merge m:1 wksht_cd clmn_num line_num fmt using `lookup', ///
			keep(match) nogenerate
		
		// drop entries we've disabled
		drop if enabled==0
		
		drop enabled fmt

		keep rpt_rec_num rec itm_val_num
		
		// total overhead expenses  (sums salaries + other)
		collapse (sum) itm_val_num, by(rpt_rec_num rec)
		replace rec = subinstr(rec, " ", "", 1)
		
		// reshape to one row per report
		rename itm_val_num val_
		reshape wide val, i(rpt_rec_num) j(rec) string			
		
		// charity care vars differ between versions. create blank variables		
		if (`fmt'==10 | (`fmt'==96 & `year' < 2002)) {
			foreach var in chguccare {
				gen val_`var' = .
			}
		}
		
		if (`fmt'==96) {
			foreach var in totinitchcare ppaychcare nonmcbaddebt costuccare_v2010 {
				gen val_`var' = .
			}
		}
		
		// zero out $ variables if they were blank
		* don't zero out CCR & beds variables
		quietly desc val_*, varlist
		foreach var in `r(varlist)' {			
			if strpos("`var'", "bed") == 0 & strpos("`var'", "ccr") == 0 & strpos("`var'", "discharges") == 0 {	
				*di "got here"
				replace `var' = 0 if `var'==.
			}
			local newname = subinstr("`var'", "val_", "", 1)	
			rename `var' `newname'		
		}

		
		// save for merging into report level data
		tempfile nmrc
		save `nmrc', replace

		clear
	
		// process report index file
		use "`fpath_input'/hosp_rpt2552_`fmt'_`year'"
		keep rpt_rec_num prvdr_num rpt_stus_cd fy_bgn_dt fy_end_dt proc_dt
	
		// bring in components of margins
		merge 1:1 rpt_rec_num using `nmrc'

		// deal with components that didn't correspond to any records in the index file!
		tab _merge
		qui count if _merge==2
		local Nusing = r(N)
		if (`Nusing'!=0) {
			display "*** there were values for some reports that were never filed! ***"
			display "...dropping `Nusing' records"
			drop if _merge==2
		}

		drop _merge
	
		gen year = `year'
		
		gen fmt = `fmt'
		
		save "`fpath_output'/merged`year'_`fmt'.dta", replace
	}
	
	* some years have multiple files of cost reports because they are reported in
	* 1996 and 2010 formats
	* append together the cost reports for each year and save as one file
	
	clear
	foreach fmt in `formats' {
		if (_N==0) {
			use "`fpath_output'/merged`year'_`fmt'.dta"
		}
		else {
			append using "`fpath_output'/merged`year'_`fmt'.dta"
		}
		
		*rm output/merged`year'_`fmt'.dta
	}
	
	save "`fpath_output'/merged`year'_`fmt'.dta", replace
	clear

}


* append together the cost reports for all the years
forvalues year=$STARTYEAR/$ENDYEAR {
	if (_N==0) {
		use "`fpath_output'/merged`year'.dta"
	}
	else {
		append using "`fpath_output'/merged`year'.dta"
	}
	
	rm "`fpath_output'/merged`year'.dta"

}

** Make sure that rpt_rec_num uniquely identifies each row 
isid rpt_rec_num
*isid rpt_rec_num year 

rename prvdr_num pn

gen income = netpatrev + othinc
gen totcost = opexp + othexp

gen margin = (income-totcost)/income

gen uccare_chg_harmonized = chguccare if fmt==96
replace uccare_chg_harmonized = totinitchcare-ppaychcare+nonmcbaddebt if fmt==10

gen uccare_cost_harmonized = ccr*uccare_chg_harmonized

sort pn year fy_bgn_dt fmt
order rpt_rec_num pn year fy_bgn_dt fmt



* create cost component subtotals
egen cost_gen = rowtotal(gen_*)



label data "cms hospital cost report data"

label var year "year"
label var fmt "report format (96=1996 10=2010)"
label var beds_adultped "beds - adults & peds"
label var availbeddays_adultped "bed days available in rpt period"
label var ipbeddays_adultped "inpatient bed days utilized"
label var ipdischarges_adultped "inpatient discharges"
label var beds_totadultped "beds - total adults & peds incl swing beds"
label var beds_total "beds - total (inc swing + spec care beds e.g. icu, ccu, nicu)"
label var donations "donations"
label var invinc "investment income"
label var iphosprev "inpatient hospital revenue"
label var ipgenrev "inpatient general revenue (total of hosp, ipf, irf, snf, etc.)"
label var ipicrev "inpatient intensive care type revenue (total of icu, ccu, etc.)"
label var iprcrev "inpatient routine care revenue (sum of ipgenrev and ipicrev)"
label var ipancrev "inpatient ancillary services revenue"
label var ipoprev "inpatient outpatient services revenue"
label var iptotrev "inpatient total patient revenue"
label var opancrev "outpatient ancillary services revenue"
label var opoprev "outpatient outpatient services revenue"
label var optotrev "outpatient total patient revenues"
label var tottotrev "total patient revenue (sum of iptotrev and optotrev)"
label var ccr "cost to charge ratio"
label var chguccare "other uncompensated care charges (1996 format only)"
label var totinitchcare "total initial obligation of patients for charity care (2010 format only)"
label var ppaychcare "partial payment by patients approved for charity care (2010 format only)"
label var nonmcbaddebt "non-medicare & non-reimbursable medicare bad debt expense (2010 format only)"
label var costuccare_v2010 "cost of uncompensated care (2010 format only)"
label var netpatrev "net patient revenues (total revenues minus allowances & discounts)"
label var othinc "other income"
label var opexp "total operating expenses"
label var othexp "total other expenses"
label var income "total income (sum of netpatrev and othinc)"
label var totcost "total cost (sum of opexp and othexp)"
label var margin "total all-payer margin i.e. profit margin (income-totcost)/income"
label var uccare_chg_harmonized "uncompensated care charges (harmonized across formats)"
label var uccare_cost_harmonized "uncompensated care costs (harmonized across formats)"
label var ipcharge "inpatient charges"
label var opcharge "outpatient charges" 
label var ppamount "primary payer amount" 
label var totampayable "total amount payable" 
label var totdischarge "total discharges" 
label var medicare_discharge "medicare discharges"
label var capinv_tot "total capital investment"
label var cost_gen "total general service costs"
label var gen_admin "admin costs"
label var gen_anesth "non-physician anesth. benefits"
label var gen_cafeteria "cafeteria cost"
label var gen_cap_building "building capital expenses"
label var gen_cap_equipment "equipment capital expenses"
label var gen_css "central services + supply costs"
label var gen_dietary "dietary expenses (patient)"
label var gen_ebd "EBD expenses"
label var gen_housekeeping "housekeeping costs"
label var gen_internresoth "intern + resident other costs"
label var gen_internressal "intern + resident salaries"
label var gen_laundry "laundry expenses"
label var gen_maintenance "facility maintenance costs"
label var gen_medrec "medical record maintenance costs"
label var gen_nurseadmin "nursing admin salaries + benefits"
label var gen_nursesch "nursing school expenses"
label var gen_paramed "paramedical ed program expenses"
label var gen_personnel "personnel maintenance (room + board)"
label var gen_pharmacy "pharmacy costs"
label var gen_plant "plant operating costs"
label var gen_socservices "social service expenses"



order ///
	rpt_rec_num pn year fmt fy_bgn_dt fy_end_dt rpt_stus_cd proc_dt ///
	beds_adultped beds_totadultped beds_total ///
	availbeddays_adultped ipbeddays_adultped ipdischarges_adultped ///
	income totcost margin ///
	uccare_chg_harmonized uccare_cost_harmonized ///s
	netpatrev othinc opexp othexp donations invinc ///
	iphosprev ipgenrev ipicrev iprcrev ipancrev ipoprev iptotrev ///
	ipcharge opcharge opancrev opoprev optotrev ///
	tottotrev ///
	ccr chguccare totinitchcare ppaychcare nonmcbaddebt costuccare_v2010 ///
	ppamount totampayable totdischarge medicare_discharge cost_* gen_* capinv_tot ///
	


compress

save "`fpath_output'/hcris_merged.dta", replace
export delimited "`fpath_output'/hcris_merged.csv", replace

log close

quietly {
    log using "`fpath_output'/hcris_merged_codebook.txt", text replace
    noisily describe, fullnames
    log close
}

log using "`fpath_output'/hcris.log", append


* now construct the hospital-year synthetic file
* apportion each report to the years it spans

rename year hcris_year

gen year_base = year(fy_bgn_dt)
gen years_spanned = year(fy_end_dt) - year(fy_bgn_dt) + 1
isid rpt_rec_num hcris_year 
expand years_spanned

egen seq = seq(), from(0) by(rpt_rec_num hcris_year)
gen year = year_base+seq
drop year_base seq

* days of the target year covered by the cost report
assert year(fy_bgn_dt)<=year & year(fy_end_dt)>=year
gen first_day_in_year = max(mdy(1,1,year),fy_bgn_dt)
format first_day_in_year %td
gen last_day_in_year = min(mdy(12,31,year),fy_end_dt)
format last_day_in_year %td

gen days_in_year = last_day_in_year-first_day_in_year+1
gen days_spanned = fy_end_dt-fy_bgn_dt+1

* share of the report's days that fell into the target year
gen frac_rpt_in_year = days_in_year/days_spanned
egen totfrac = sum(frac_rpt_in_year), by(rpt_rec_num hcris_year)
assert totfrac==1
drop totfrac

* share of the target year's days that were covered by the report
gen double frac_year_covered = days_in_year/(mdy(12,31,year)-mdy(1,1,year)+1)

* scale the flows by the share of the report that was in the target year
foreach var of varlist ///
	availbeddays_adultped ipbeddays_adultped ipdischarges_adultped ///
	donations invinc netpatrev opexp othexp othinc income totcost ///
	iphosprev ipgenrev ipicrev iprcrev ipancrev ipoprev iptotrev ///
	opancrev opoprev optotrev tottotrev ipcharge opcharge medicare_discharge ///
	chguccare totinitchcare ppaychcare nonmcbaddebt costuccare_v2010 ///
	uccare_chg_harmonized uccare_cost_harmonized cost_* gen_* capinv_tot {
	replace `var' = `var'*frac_rpt_in_year
}

gen nreports = 1
gen nfmt96 = fmt==96
gen nfmt10 = fmt==10

* did report exclude uncompensated care data?
* "Complete lines 17 through 32 for cost reporting periods ending on or after April 30, 2003."
* - Medicare v1996 documentation section 3609.4
gen nno_uncomp = fy_end_dt<mdy(4,30,2003)

* make weighted averages of the CCR & bed counts from the reports
* each report gets weight: fraction of target year covered
foreach var of varlist ccr beds_* {
	egen `var'_wtd = wtmean(`var'), weight(frac_year_covered) by(pn year)
}

* some variables should never be missing
foreach var of varlist ///
	frac_year_covered nreports nfmt96 nfmt10 nno_uncomp ///
	first_day_in_year last_day_in_year ///
	pn year ///
{
	assert !missing(`var')
}

* make missing values flags so we can reset variables to missing after the
* collapse statement
foreach var of varlist ///
	availbeddays_adultped ipbeddays_adultped ipdischarges_adultped ///
	donations invinc netpatrev opexp othexp othinc income totcost ///
	iphosprev ipgenrev ipicrev iprcrev ipancrev ipoprev iptotrev ///
	opancrev opoprev optotrev tottotrev	ipcharge opcharge ///
	ppamount totampayable totdischarge medicare_discharge ///
	chguccare totinitchcare ppaychcare nonmcbaddebt costuccare_v2010 ///
	uccare_chg_harmonized uccare_cost_harmonized cost_* gen_* capinv_tot ///
	ccr_wtd beds_*_wtd {
	generate miss_`var' = missing(`var')
}

* for a couple vars we'll need missing flags for their min and max
foreach var of varlist ///
	ccr beds_total ///
{
	generate miss_`var'_min = missing(`var')
	generate miss_`var'_max = missing(`var')

}

* down to the hospital-year level

collapse ///
	(sum) ///
	availbeddays_adultped ipbeddays_adultped ipdischarges_adultped ///
	donations invinc netpatrev opexp othexp othinc income totcost ///
	iphosprev ipgenrev ipicrev iprcrev ipancrev ipoprev iptotrev ///
	opancrev opoprev optotrev tottotrev ipcharge opcharge ///
	ppamount totampayable totdischarge medicare_discharge /// 
	chguccare totinitchcare ppaychcare nonmcbaddebt costuccare_v2010 ///
	uccare_chg_harmonized uccare_cost_harmonized ///
	frac_year_covered nreports nfmt96 nfmt10 nno_uncomp cost_* gen_* capinv_tot ///
	(min) covg_begin_dt=first_day_in_year ccr_min=ccr beds_total_min=beds_total ///
	(max) covg_end_dt=last_day_in_year ccr_max=ccr beds_total_max=beds_total ///
	(mean) ccr_wtd beds_*_wtd ///
	(max) miss_*, ///
	by(pn year)

* replace variables as missing if any of the embodied observations in the
* collapsed value were missing
foreach missvar of varlist miss_* {
	local basevar = regexr("`missvar'","^miss_","")
	replace `basevar' = . if `missvar'
	drop `missvar'
}

* get rid of years outside the startyear / endyear window
* (some reports in the endyear file run through the following year)
drop if year < $STARTYEAR_HY | year > $ENDYEAR_HY

gen byte flag_short = frac_year_covered<1
gen byte flag_long = frac_year_covered>1

gen margin = (income-totcost)/income

sort pn year

label data "cms hospital cost report data (synthetic calendar year)"

label var year "year"
label var beds_adultped_wtd "beds - adults & peds (weighted avg over reports)"
label var beds_totadultped_wtd "beds - total adults & peds incl swing beds (weighted avg over reports)"
label var beds_total_wtd "beds - total (inc swing + spec care beds e.g. icu, ccu, nicu) (weighted avg over reports)"
label var beds_total_min "beds - total (inc swing + spec care beds e.g. icu, ccu, nicu) (min of reports)"
label var beds_total_max "beds - total (inc swing + spec care beds e.g. icu, ccu, nicu) (max of reports)"
label var availbeddays_adultped "bed days available in rpt period"
label var ipbeddays_adultped "inpatient bed days utilized"
label var ipdischarges_adultped "inpatient discharges"
label var donations "donations"
label var invinc "investment income"
label var iphosprev "inpatient hospital revenue"
label var ipgenrev "inpatient general revenue (total of hosp, ipf, irf, snf, etc.)"
label var ipicrev "inpatient intensive care type revenue (total of icu, ccu, etc.)"
label var iprcrev "inpatient routine care revenue (sum of ipgenrev and ipicrev)"
label var ipancrev "inpatient ancillary services revenue"
label var ipoprev "inpatient outpatient services revenue"
label var iptotrev "inpatient total patient revenue"
label var opancrev "outpatient ancillary services revenue"
label var opoprev "outpatient outpatient services revenue"
label var optotrev "outpatient total patient revenues"
label var tottotrev "total patient revenue (sum of iptotrev and optotrev)"
label var ccr_wtd "cost to charge ratio (weighted avg over reports)"
label var ccr_min "cost to charge ratio (min of reports)"
label var ccr_max "cost to charge ratio (max of reports)"
label var chguccare "other uncompensated care charges (1996 format only)"
label var totinitchcare "total initial obligation of patients for charity care (2010 format only)"
label var ppaychcare "partial payment by patients approved for charity care (2010 format only)"
label var nonmcbaddebt "non-medicare & non-reimbursable medicare bad debt expense (2010 format only)"
label var costuccare_v2010 "cost of uncompensated care (2010 format only)"
label var netpatrev "net patient revenues (total revenues minus allowances & discounts)"
label var othinc "other income"
label var opexp "total operating expenses"
label var othexp "total other expenses"
label var income "total income (sum of netpatrev and othinc)"
label var totcost "total cost (sum of opexp and othexp)"
label var margin "total all-payer margin i.e. profit margin (income-totcost)/income"
label var uccare_chg_harmonized "uncompensated care charges (harmonized across formats)"
label var uccare_cost_harmonized "uncompensated care costs (harmonized across formats)"
label var frac_year_covered "sum of days in reports / days in year"
label var nreports "number of cost reports included in row"
label var nfmt96 "number of 1996 format cost reports included in row"
label var nfmt10 "number of 2010 format cost reports included in row"
label var nno_uncomp "number of cost reports included in row that lack uncompensated care data"
label var covg_begin_dt "first day in year with cost report coverage in row"
label var covg_end_dt "last day in year with cost report coverage in row"
label var flag_short "flag for fewer total days in cost reports than days in year"
label var flag_long "flag for more total days in cost reports than days in year"
label var ipcharge "inpatient charges"
label var opcharge "outpatient charges" 
label var ppamount "primary payer amount" 
label var totampayable "total amount payable" 
label var totdischarge "total discharges" 
label var medicare_discharge "medicare discharges"
label var cost_gen "general service costs"
label var gen_admin "admin costs"
label var gen_anesth "non-physician anesth. benefits"
label var gen_cafeteria "cafeteria cost"
label var gen_cap_building "building capital expenses"
label var gen_cap_equipment "equipment capital expenses"
label var gen_css "central services + supply costs"
label var gen_dietary "dietary expenses (patient)"
label var gen_ebd "EBD expenses"
label var gen_housekeeping "housekeeping costs"
label var gen_internresoth "intern + resident other costs"
label var gen_internressal "intern + resident salaries"
label var gen_laundry "laundry expenses"
label var gen_maintenance "facility maintenance costs"
label var gen_medrec "medical record maintenance costs"
label var gen_nurseadmin "nursing admin salaries + benefits"
label var gen_nursesch "nursing school expenses"
label var gen_paramed "paramedical ed program expenses"
label var gen_personnel "personnel maintenance (room + board)"
label var gen_pharmacy "pharmacy costs"
label var gen_plant "plant operating costs"
label var gen_socservices "social service expenses"
label var capinv_tot "total capital investment"
/*label var cost_ip "inpatient service costs"
label var cost_op "er + clinic costs"
label var cost_anc "ancillary service costs"
label var cost_sp "special purpose costs"
label var cost_oth "other reimbursable costs"
*/



order ///
	pn year ///
	beds_adultped_wtd beds_totadultped_wtd beds_total_* ///
	availbeddays_adultped ipbeddays_adultped ipdischarges_adultped ///
	income totcost margin ///
	uccare_chg_harmonized uccare_cost_harmonized ///
	netpatrev othinc opexp othexp donations invinc ///
	iphosprev ipgenrev ipicrev iprcrev ipancrev ipoprev iptotrev ///
	ipcharge opcharge opancrev opoprev optotrev ///
	tottotrev ///
	ccr_* chguccare totinitchcare ppaychcare nonmcbaddebt costuccare_v2010 ///
	nreports nfmt96 nfmt10 nno_uncomp ///
	frac_year_covered covg_begin_dt covg_end_dt flag_short flag_long ///
	ppamount totampayable totdischarge medicare_discharge cost_* gen_* capinv_tot

compress

*save ouse , replace
export delimited "`fpath_output'/hcris_merged_hospyear.csv", replace
save "`fpath_output'/hcris_merged_hospyear", replace 
log close

quietly {
    log using "`fpath_output'/hcris_merged_hospyear_codebook.txt", text replace
    noisily describe, fullnames
    log close
}

log using "`fpath_output'/hcris.log", append

log close
